import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import statsmodels.api as sm
import collections
import seaborn as sn
import missingno as msno
from itables import show
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans,DBSCAN, AgglomerativeClustering
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFromModel
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_score, recall_score, f1_score, r2_score
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, plot_precision_recall_curve,confusion_matrix
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
In the BDA 502 term project, I analysed features of the tracks and tried to understand the correlation between features and popularity. For this purpose, I obtained 80,000 Techno tracks and their features using the Spotify Web API. Basically, Spotify presents various endpoints for retrieving information in the API. Artists, Albums, Playlists, and User Profiles are some examples of these endpoints. More information can be found in the references. For this project, mainly 3 types of information were obtained:
In this project, my main purpose is predicting binary popularity feature using supervised machine learning methods and selecting the best algorithm for popularity prediction.
# Load the Spotify track dump (JSON collected via the Web API) into a DataFrame.
with open('D:\\Int. ML\\Dönem Ödevi\\track_data.json', encoding="utf8", errors='ignore') as f:
    data = json.load(f)
df = pd.DataFrame(data)
# Quick structural overview of the raw data.
display(df.describe())
display(df.info())
display(df.head())
# Visualise the missingness pattern, then count NaNs per column.
msno.matrix(df)
df.isna().sum()
# Keep only the columns that are meaningful as prediction inputs.
feature_positions = list(range(2, 14)) + [16, 20, 24, 28, 29]
tracks = df.iloc[:, feature_positions]
# Drop tracks with zero popularity or zero tempo (assumed to be missing data).
tracks = tracks[(tracks.Track_Popularity > 0) & (tracks.tempo > 0)].reset_index(drop=True)
tracks
To solve this prediction problem with a classification approach, I split the Track Popularity feature into two binary categories (popular, not popular) using the median, because the popularity column is highly skewed. Thus, we can increase the accuracy of the machine learning models.
sns.set(style="darkgrid")
plt.figure(figsize=(16, 8))
sns.kdeplot(tracks.Track_Popularity, shade=True, linewidth=5)
# Binning Track Popularity using the median: popularity is highly skewed, so
# the median produces two roughly balanced classes.
ftracks = tracks.copy()
# PERF: the original list comprehension recomputed np.median() once per row
# (O(n^2) on 80k rows). Compute it once and label with a vectorised comparison;
# strictly above the median -> 1 (popular), otherwise 0 — same result.
median_pop = ftracks.Track_Popularity.median()
ftracks["Popular"] = (ftracks.Track_Popularity > median_pop).astype(int)
display(ftracks)
from wordcloud import WordCloud
# Word cloud of every genre tag attached to the artists in the dataset.
plt.figure(figsize=(14, 13))
text = " ".join(genre for genres in df.Artist_Genres for genre in genres)
wordcloud = WordCloud(background_color="black").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
# Ten most popular tracks.
df.sort_values(by=['Track_Popularity'], ascending=False).loc[:, ["Track_Name", "Artist_Name", "Track_Popularity"]].head(n=10)
# Ten most popular artists (max popularity per artist).
df.loc[:, ["Artist_Name", "Artist_Popularity"]].groupby(["Artist_Name"]).max().sort_values(by=['Artist_Popularity'], ascending=False).head(n=10)
# Fifteen artists with the most tracks in the dataset.
tr = (
    df.loc[:, ["Artist_Name", "Artist_Popularity"]]
    .groupby(["Artist_Name"])
    .count()
    .rename(columns={'Artist_Popularity': 'Count'})
    .sort_values(by=['Count'], ascending=False)
    .head(n=15)
    .reset_index()
)
import plotly.express as px
fig = px.bar(tr, x='Artist_Name', y='Count')
fig.show()
sns.set(style="whitegrid")
# Density plot of each main feature, one figure per column.
# The original repeated the same two lines for every column; a loop is
# equivalent and DRY. (The display() calls around the last two plots only
# echoed the Axes repr in the notebook, so they are dropped.)
for col in ["tempo", "danceability", "acousticness", "instrumentalness",
            "loudness", "valence", "Artist_Popularity"]:
    plt.figure(figsize=(16, 8))
    sns.kdeplot(ftracks[col], shade=True, linewidth=5)
# Pairwise scatter plots of the model features, coloured by the Popular label.
slicer = list(range(1, 12)) + [15, 16, 17]
plt.figure(figsize=(16, 8))
sns.set(style="ticks", color_codes=True)
sns.pairplot(ftracks.iloc[:, slicer], hue="Popular")
# Full correlation matrix as an annotated diverging heatmap.
cor = ftracks.corr()
plt.figure(figsize=(14, 14))
ax = sns.heatmap(
    cor,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# Expand the y-limits so the first/last heatmap rows are not clipped.
ax.set_ylim(len(cor) + 0.5, -0.5);
# Correlation of every feature with the target label.
cor["Popular"]
The correlation between the features and the Popular column is displayed in the heatmap and table above. There isn't any high collinearity (>0.8) seen between the features.
Mainly, 5 supervised classification methods are planned to be used for predicting the track popularity. These methods are Logistic Regression, Decision Trees, Support Vector Machine, Random Forests and XGBoost Classifier. All supervised machine learning algorithms in this project went through the same steps. These steps can be summarised as follows:
Scaling the data (X_train_s, X_test_s), splitting the data (X_train, X_test), and selecting features according to their importance for the given algorithm.
#Random state definition
# Random state shared by shuffling, outlier removal and the train/test split
# so the whole pipeline is reproducible.
rnd = 50
# Shuffling data so cross-validation folds are not order-dependent.
shf = ftracks.sample(frac=1, random_state=rnd).reset_index(drop=True)
# Creating features and labels; Track_Popularity is dropped because the
# Popular label was derived from it.
X = shf.drop(['Track_Popularity', "Popular"], axis=1)
y = shf.loc[:, "Popular"]
# Anomaly detection using IsolationForest: predict() returns 1 for inliers.
# NOTE(review): the forest is fitted on the full dataset before the
# train/test split, so outlier removal has seen the future test rows —
# confirm this leakage is acceptable for the analysis.
clf = IsolationForest(max_samples=100, random_state=rnd, contamination=.1)
clf.fit(X)
y_pred_train = clf.predict(X)
# Keep inliers only. Direct boolean masking replaces the original
# np.where(pred == 1, True, False) round-trip — identical result.
X_rem = X[y_pred_train == 1]
y_rem = y[X_rem.index]
# Splitting data (75% train / 25% test).
X_train, X_test, y_train, y_test = train_test_split(X_rem, y_rem, test_size=0.25, random_state=rnd)
# Scaling data: MinMaxScaler fitted on the training split only (no leakage here).
scaler = MinMaxScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
# Final data: scaled DataFrames with the original column names restored.
X_train = pd.DataFrame(X_train_s, columns=X.columns)
X_test = pd.DataFrame(X_test_s, columns=X.columns)
print(X_train.shape)
For feature selection, basically, I tried 2 methods. Firstly, I analysed the Chi-Square values of the features and selected features based on Chi-Square p values and correlation matrix. Secondly, I tried to find feature importance for each method and selected the features according to their importance in the given method.
# Chi-square score and p-value of every scaled feature against the label.
selector_chi = SelectKBest(chi2, k=X_train.shape[1]).fit(X_train, y_train)
chi_table = pd.DataFrame({
    "Columns": X_train.columns.tolist(),
    "Scores": selector_chi.scores_,
    "P_values": selector_chi.pvalues_,
})
selection_chi = chi_table.sort_values(by='Scores', ascending=False).reset_index()
display(selection_chi)
# Keep only the features whose chi-square p-value falls below the threshold.
threshold = 0.4
selected_features = selection_chi[selection_chi.P_values < threshold].Columns.tolist()
print("For defined threshold, {} features are selected. \n These features are: {}".format(len(selected_features), selected_features))
In the correlation section, it seems that loudness has a higher correlation than some of the features selected above. Thus, I decided to add this feature to the selected columns. In the analysis, we saw that adding the loudness feature increases the accuracy.
# New features: the chi-square picks plus loudness (added for its correlation
# with the label — see the analysis above the correlation heatmap).
X_train_s = X_train.loc[:, selected_features + ['loudness']]
X_test_s = X_test.loc[:, selected_features + ['loudness']]
# Baseline logistic regression with 3-fold cross-validation.
models = LogisticRegression(random_state=rnd, n_jobs=-1)
cv = cross_validate(models, X_train_s, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test_s, y_test)}")
# Recursive Feature Elimination: try every subset size n and keep the one with
# the best mean cross-validation accuracy.
high_score = 0
nof = 0
score_list = []
cv_list = []
for n in range(1, len(X_train.columns) + 1):
    model = LogisticRegression(random_state=rnd, n_jobs=-1)
    # FIX: pass n by keyword — the positional second argument of RFE was
    # deprecated in scikit-learn 0.24 and removed in later releases.
    rfe = RFE(model, n_features_to_select=n)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    cv = cross_validate(model, X_train_rfe, y_train, cv=3, n_jobs=-1, return_estimator=True)
    score = cv["estimator"][0].score(X_test_rfe, y_test)
    cv_s = np.mean(cv['test_score'])
    score_list.append(score)
    cv_list.append(cv_s)
    # Selection is driven by CV accuracy only; the test score is recorded
    # for reporting, not used to choose n.
    if cv_s > high_score:
        high_score = cv_s
        nof = n
        test_s = score
        sup = rfe.get_support(indices=True)
print("Optimum number of features: {}".format(nof))
print('Selected features are:', X_train.iloc[:, sup].columns.tolist(), "\n")
print('Mean training accuracy with {} features: {:.6f}'.format(nof, high_score))
print("Test accuracy with {} features: {:.6f}".format(nof, test_s))
Both training and test accuracy of RFE features are higher than chi-square features. Thus, I selected the RFE features as independent variables for hyperparameter tuning process.
#New dataset after RFE
X_train_log = X_train.iloc[:, sup]
X_test_log = X_test.iloc[:, sup]
# Hyperparameter tuning.
# FIX: the original single grid crossed every penalty with every solver, but
# most pairs are invalid (l1 is not supported by newton-cg/lbfgs/sag, and
# elasticnet requires saga plus l1_ratio), so those fits error out. A list of
# sub-grids keeps only valid penalty/solver combinations.
param_log = [
    {"penalty": ["l2"],
     "C": [0.01, 0.1, 1, 5, 10],
     "tol": [1e-3, 1e-2, 1e-1],
     "solver": ["newton-cg", "lbfgs", "liblinear", "sag"],
     "max_iter": [50, 100]},
    {"penalty": ["l1"],
     "C": [0.01, 0.1, 1, 5, 10],
     "tol": [1e-3, 1e-2, 1e-1],
     "solver": ["liblinear"],
     "max_iter": [50, 100]},
]
log_reg = LogisticRegression(random_state=rnd, n_jobs=-1)
clf_log = GridSearchCV(log_reg, param_log, cv=5, n_jobs=-1)
clf_log.fit(X_train_log, y_train)
print("Tuned Logistic Regression Classification Parameters: {}".format(clf_log.best_params_))
print("Mean of the cv scores is {:.6f}".format(clf_log.best_score_))
print("Test Score {:.6f}".format(clf_log.score(X_test_log, y_test)))
print("Seconds used for refitting the best model on the train dataset: {:.6f}".format(clf_log.refit_time_))
#Confusion Matrix
y_pred_log = clf_log.predict(X_test_log)
conf = confusion_matrix(y_test, y_pred_log)
plt.figure(figsize=(16, 8))
ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 13})
# labels, title and ticks
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_title('Confusion Matrix of Logistic Regression', fontsize=20)
ax.xaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
ax.yaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
plt.show()
# ROC curve from the predicted probability of the positive class.
y_pred_prop = clf_log.predict_proba(X_test_log)[:, 1]
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_prop)
roc_auc_log = auc(fpr_log, tpr_log)
sns.set_style("white")
plt.figure(figsize=(14, 10))
plt.plot(fpr_log, tpr_log, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc_log)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Precision-recall curve; the no-skill baseline is the positive-class prevalence.
lr_precision, lr_recall, _ = precision_recall_curve(y_test, y_pred_prop)
no_skill = len(y_test[y_test == 1]) / len(y_test)
plt.figure(figsize=(14, 10))
# BUGFIX: the label was "'Logistic' % roc_auc_log", which raises TypeError
# ("not all arguments converted...") because the string has no format specifier.
plt.plot(lr_recall, lr_precision, color='red', label='Logistic')
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall', fontsize=18, labelpad=10)
plt.ylabel('Precision', fontsize=18)
plt.title('Logistic Regression Precision-Recall Curve', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Summary metrics for the final comparison table.
train_ac_log = clf_log.best_score_
accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_score_log = f1_score(y_test, y_pred_log)
time_log = clf_log.refit_time_
logistic_result = [train_ac_log, accuracy_log, precision_log, recall_log, f1_score_log, roc_auc_log, time_log]
#Accuracy table generation
res_col = ['Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Fitting Time']
log_result = np.array(logistic_result).reshape(1, -1)
f_log = pd.DataFrame(log_result, index=['Logistic Regression'], columns=res_col)
f_log
# Baseline decision tree on the chi-square feature set (3-fold CV).
models = DecisionTreeClassifier(random_state=rnd)
cv = cross_validate(models, X_train_s, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test_s, y_test)}")
With all data:
# Decision tree on the full feature set, followed by its feature-importance chart.
models = DecisionTreeClassifier(random_state=rnd)
cv = cross_validate(models, X_train, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test, y_test)}")
fi_dt = pd.DataFrame({'Feature': X_train.columns,
                      'Importance': cv["estimator"][0].feature_importances_})
fi_dt = fi_dt.sort_values(by="Importance", ascending=True).reset_index(drop=True)
fig = px.bar(fi_dt, x='Importance', y='Feature', orientation='h', color='Importance')
fig.show()
# Greedy search over feature counts: add features in descending importance
# order and keep the count with the best mean CV accuracy.
val_t = 0
val_s = 0
fi_dt = fi_dt.sort_values(by="Importance", ascending=False)
# FIX: the original iterated the reversed index labels and leaned on
# positional slicing of a non-monotonic index; iterating the count directly
# (all features down to one) is equivalent — including the tie-breaking
# behaviour of the strict ">" test — and far less fragile.
for k in range(len(fi_dt), 0, -1):
    cols = fi_dt.Feature[0:k]
    models = DecisionTreeClassifier(random_state=rnd)
    cv = cross_validate(models, X_train.loc[:, cols], y_train, cv=3, n_jobs=-1, return_estimator=True)
    tr = np.mean(cv['test_score'])
    ts = cv["estimator"][0].score(X_test.loc[:, cols], y_test)
    if tr > val_t:
        val_t = tr
        names = cols.tolist()
        val_s = ts
        nof = k
print("Optimum number of features: {}".format(nof))
print('Selected features are:', names, "\n")
print('Mean training accuracy with {} features: {:.6f}'.format(nof, val_t))
print("Test accuracy with {} features: {:.6f}".format(nof, val_s))
Both training and test accuracy of selected features by their importance are higher than chi-square features. Thus, I selected the 2 columns as independent variables for hyperparameter tuning process.
# Grid search over split and pruning hyperparameters for the decision tree,
# using the feature subset chosen above.
X_train_dt = X_train.loc[:, names]
X_test_dt = X_test.loc[:, names]
param_dt = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "min_samples_split": [2, 10, 30, 50, 100, 200, 500],
    "ccp_alpha": [0, 1e-3, 0.01, 0.1, 1, 5],
}
dt = DecisionTreeClassifier(random_state=rnd)
clf_dt = GridSearchCV(dt, param_dt, cv=5, n_jobs=-1)
clf_dt.fit(X_train_dt, y_train)
print(f"Tuned Decision Tree Classification Parameters: {clf_dt.best_params_}")
print(f"Mean of the cv scores is {clf_dt.best_score_:.6f}")
print(f"Test Score {clf_dt.score(X_test_dt, y_test):.6f}")
print(f"Seconds used for refitting the best model on the whole dataset: {clf_dt.refit_time_:.6f}")
# Render the tuned decision tree with Graphviz.
import os
from sklearn.tree import export_graphviz
# HACK: machine-specific — prepending the local Anaconda Graphviz binaries to
# PATH so the `dot` executable can be found; this only works on this machine.
os.environ["PATH"] += ';' + r'C:\Users\Dell\Anaconda3\Library\bin\graphviz'
import graphviz
# Export the best estimator as DOT source; out_file=None returns the string.
dot_data = export_graphviz(clf_dt.best_estimator_, out_file=None,
feature_names=names,
class_names=["Not Popular","Popular"],
filled=True, rounded=True,
special_characters=True)
# graphviz.Source renders inline in the notebook when left as the last expression.
graph = graphviz.Source(dot_data)
graph
y_pred_dt = clf_dt.predict(X_test_dt)
#Confusion Matrix
conf = confusion_matrix(y_test, y_pred_dt)
plt.figure(figsize=(16, 8))
ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 13})
# labels, title and ticks
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_title('Confusion Matrix of Decision Tree', fontsize=20).set_position([.5, 1.02])
ax.xaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
ax.yaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
plt.show()
#Probabilty Calculation
y_pred_prop = clf_dt.predict_proba(X_test_dt)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_prop)
roc_auc_dt = auc(fpr_dt, tpr_dt)
plt.figure(figsize=(14, 10))
plt.plot(fpr_dt, tpr_dt, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc_dt)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# BUGFIX: the PR curve was computed from the hard 0/1 predictions, which only
# yields a degenerate two-point curve; use the positive-class probabilities
# computed above instead. Also, "'Decision Tree' % roc_auc_dt" raised
# TypeError because the label string has no format specifier.
dt_precision, dt_recall, _ = precision_recall_curve(y_test, y_pred_prop)
no_skill = len(y_test[y_test == 1]) / len(y_test)
plt.figure(figsize=(14, 10))
plt.plot(dt_recall, dt_precision, color='red', label='Decision Tree')
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall', fontsize=18, labelpad=10)
plt.ylabel('Precision', fontsize=18)
plt.title('Decision Tree Precision-Recall Curve', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Summary metrics for the comparison table.
train_ac_dt = clf_dt.best_score_
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_score_dt = f1_score(y_test, y_pred_dt)
time_dt = clf_dt.refit_time_
dt_result = [train_ac_dt, accuracy_dt, precision_dt, recall_dt, f1_score_dt, roc_auc_dt, time_dt]
dt_result = np.array(dt_result).reshape(1, -1)
f_dt = pd.DataFrame(dt_result, index=['Decision Tree'], columns=res_col)
f_dt
# Baseline support vector classifier on the chi-square feature set (3-fold CV).
models = SVC(random_state=rnd)
cv = cross_validate(models, X_train_s, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test_s, y_test)}")
For feature selection, chi-square values of the columns were considered.
# Forward search over the chi-square-ordered feature list for the SVC: grow
# the feature set one column at a time and keep the best mean CV accuracy.
val_svc = 0
test_svc = 0
for i in range(X_train_s.shape[1]):
    cols = X_train_s.columns[0:i + 1]
    models = SVC(random_state=rnd)
    cv = cross_validate(models, X_train_s.loc[:, cols], y_train, cv=3, n_jobs=-1, return_estimator=True)
    svc_tr = np.mean(cv['test_score'])
    # CONSISTENCY FIX: score on X_test_s, the same chi-square feature frame
    # the model was trained on. The original used X_test, which happens to
    # hold identical values for these columns, so results are unchanged —
    # but the intent is now explicit.
    svc_ts = cv["estimator"][0].score(X_test_s.loc[:, cols], y_test)
    if svc_tr > val_svc:
        val_svc = svc_tr
        names_svc = cols.tolist()
        test_svc = svc_ts
        nof = i + 1
print("Optimum number of features: {}".format(nof))
print('Selected features are:', names_svc, "\n")
print('Mean training accuracy with {} features: {:.6f}'.format(nof, val_svc))
print("Test accuracy with {} features: {:.6f}".format(nof, test_svc))
Due to the CPU capacity, I used RandomizedSearchCV to optimize Support Vector Classifier hyperparameters.
# Randomised hyperparameter search (full grid search is too expensive for SVC).
X_train_svc = X_train.loc[:, names_svc]
X_test_svc = X_test.loc[:, names_svc]
param_svc = {
    "C": [0.0001, 0.001, 0.01, 0.1, 1, 10],
    "kernel": ['linear', "poly", "rbf"],
    'gamma': ["scale", "auto"],
    "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}
svc = SVC(random_state=rnd)
clf_svc = RandomizedSearchCV(svc, param_svc, n_iter=15, cv=3, n_jobs=-1)
clf_svc.fit(X_train_svc, y_train)
print(f"Tuned Support Vector Classification Parameters: {clf_svc.best_params_}")
print(f"Mean of the cv scores is {clf_svc.best_score_:.6f}")
print(f"Test Score {clf_svc.score(X_test_svc, y_test):.6f}")
print(f"Seconds used for refitting the best model on the train dataset: {clf_svc.refit_time_:.6f}")
y_pred_svc = clf_svc.predict(X_test_svc)
#Confusion Matrix
conf = confusion_matrix(y_test, y_pred_svc)
plt.figure(figsize=(16, 8))
ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 13})
# labels, title and ticks
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_title('Confusion Matrix of SVC', fontsize=20).set_position([.5, 1.02])
ax.xaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
ax.yaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
plt.show()
# ROC from the SVC decision-function scores (probability=False by default,
# so decision_function is the ranking score available here).
y_score_svc = clf_svc.decision_function(X_test_svc)
fpr_svc, tpr_svc, _ = roc_curve(y_test, y_score_svc)
roc_auc_svc = auc(fpr_svc, tpr_svc)
plt.figure(figsize=(14, 10))
plt.plot(fpr_svc, tpr_svc, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc_svc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# BUGFIX: the PR curve was computed from the hard 0/1 predictions (degenerate
# two-point curve); use the continuous decision scores instead. Also,
# "'SVC' % roc_auc_svc" raised TypeError — no format specifier in the string.
svc_precision, svc_recall, _ = precision_recall_curve(y_test, y_score_svc)
no_skill = len(y_test[y_test == 1]) / len(y_test)
plt.figure(figsize=(14, 10))
plt.plot(svc_recall, svc_precision, color='red', label='SVC')
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall', fontsize=18, labelpad=10)
plt.ylabel('Precision', fontsize=18)
plt.title('SVC Precision-Recall Curve', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Summary metrics for the comparison table.
train_ac_svc = clf_svc.best_score_
accuracy_svc = accuracy_score(y_test, y_pred_svc)
precision_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
f1_score_svc = f1_score(y_test, y_pred_svc)
time_svc = clf_svc.refit_time_
svc_result = [train_ac_svc, accuracy_svc, precision_svc, recall_svc, f1_score_svc, roc_auc_svc, time_svc]
svc_result = np.array(svc_result).reshape(1, -1)
f_svc = pd.DataFrame(svc_result, index=['SVM Classifier'], columns=res_col)
f_svc
# Random forest baseline on the chi-square feature set (3-fold CV).
models = RandomForestClassifier(random_state=rnd)
cv = cross_validate(models, X_train_s, y_train, cv=3, n_jobs=-1, return_estimator=True)
print("Mean training accuracy: {}".format(np.mean(cv['test_score'])))
print("Test accuracy: {}".format(cv["estimator"][0].score(X_test_s, y_test)))
# Model-based selection: SelectFromModel keeps the features whose importance
# exceeds its threshold (the mean importance by default).
# FIX: the selector forest now gets random_state=rnd so the chosen column set
# is reproducible run-to-run (it was previously unseeded). The redundant
# re-import of RandomForestClassifier is dropped — it is imported at the top
# of the file.
selector = SelectFromModel(RandomForestClassifier(random_state=rnd)).fit(X_train, y_train)
X_rnd_train = selector.transform(X_train)
X_rnd_test = selector.transform(X_test)
print("Selected Columns:", X.loc[:, selector.get_support()].columns.tolist(), "\n")
# Baseline again on the reduced feature set.
models = RandomForestClassifier(random_state=rnd)
cv = cross_validate(models, X_rnd_train, y_train, cv=3, n_jobs=-1, return_estimator=True)
print("Mean training accuracy: {}".format(np.mean(cv['test_score'])))
print("Test accuracy: {}".format(cv["estimator"][0].score(X_rnd_test, y_test)))
# Grid search over the random-forest split/pruning hyperparameters.
param_rnd = {
    "min_samples_split": [2, 50, 150, 500],
    "criterion": ["gini", "entropy"],
    "ccp_alpha": [0, 1e-4, 1e-3, 0.01, 0.1],
}
ran_forest = RandomForestClassifier(random_state=rnd)
clf_rnd = GridSearchCV(ran_forest, param_rnd, cv=3, n_jobs=-1)
clf_rnd.fit(X_rnd_train, y_train)
print(f"Tuned Random Forest Classification Parameters: {clf_rnd.best_params_}")
print(f"Mean of the cv scores is {clf_rnd.best_score_:.6f}")
print(f"Test Score {clf_rnd.score(X_rnd_test, y_test):.6f}")
print(f"Seconds used for refitting the best model on the train dataset: {clf_rnd.refit_time_:.6f}")
y_pred_rnd = clf_rnd.predict(X_rnd_test)
#Confusion Matrix
conf = confusion_matrix(y_test, y_pred_rnd)
plt.figure(figsize=(16, 8))
ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 13})
# labels, title and ticks
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_title('Confusion Matrix of Random Forests', fontsize=20).set_position([.5, 1.02])
ax.xaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
ax.yaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
plt.show()
# ROC curve from the positive-class probabilities.
y_pred_prop = clf_rnd.predict_proba(X_rnd_test)[:, 1]
fpr_rnd, tpr_rnd, _ = roc_curve(y_test, y_pred_prop)
roc_auc_rnd = auc(fpr_rnd, tpr_rnd)
plt.figure(figsize=(14, 10))
plt.plot(fpr_rnd, tpr_rnd, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc_rnd)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# BUGFIX: the PR curve was computed from the hard 0/1 predictions (degenerate
# two-point curve); use the probabilities computed above. The label also said
# 'SVC' (copy-paste error) and "'SVC' % roc_auc_rnd" raised TypeError because
# the string has no format specifier.
rnd_precision, rnd_recall, _ = precision_recall_curve(y_test, y_pred_prop)
no_skill = len(y_test[y_test == 1]) / len(y_test)
plt.figure(figsize=(14, 10))
plt.plot(rnd_recall, rnd_precision, color='red', label='Random Forest')
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall', fontsize=18, labelpad=10)
plt.ylabel('Precision', fontsize=18)
plt.title('Random Forests Precision-Recall Curve', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Summary metrics for the comparison table.
train_ac_rnd = clf_rnd.best_score_
accuracy_rnd = accuracy_score(y_test, y_pred_rnd)
precision_rnd = precision_score(y_test, y_pred_rnd)
recall_rnd = recall_score(y_test, y_pred_rnd)
f1_score_rnd = f1_score(y_test, y_pred_rnd)
time_rnd = clf_rnd.refit_time_
rnd_result = [train_ac_rnd, accuracy_rnd, precision_rnd, recall_rnd, f1_score_rnd, roc_auc_rnd, time_rnd]
rnd_result = np.array(rnd_result).reshape(1, -1)
f_rnd = pd.DataFrame(rnd_result, index=['Random Forest'], columns=res_col)
f_rnd
from xgboost import XGBClassifier, plot_importance
# XGBoost baseline on the chi-square feature set (3-fold CV).
models = XGBClassifier(seed=rnd)
cv = cross_validate(models, X_train_s, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test_s, y_test)}")
# XGBoost on the full feature set, followed by its feature importances.
models = XGBClassifier(seed=rnd, n_jobs=-1)
cv = cross_validate(models, X_train, y_train, cv=3, n_jobs=-1, return_estimator=True)
print(f"Mean training accuracy: {np.mean(cv['test_score'])}")
print(f"Test accuracy: {cv['estimator'][0].score(X_test, y_test)}")
fi_xgb = pd.DataFrame({'Feature': X_train.columns,
                       'Importance': cv["estimator"][0].feature_importances_})
fi_xgb = fi_xgb.sort_values(by="Importance", ascending=True).reset_index(drop=True)
fig = px.bar(fi_xgb, x='Importance', y='Feature', orientation='h', color='Importance')
fig.show()
# Greedy search over XGBoost feature counts, mirroring the decision-tree
# search: features are added in descending importance order and the count
# with the best mean CV accuracy wins.
xgb_t = 0
xgb_s = 0
cv_list = list()
test_list = list()
fi_xgb = fi_xgb.sort_values(by="Importance", ascending=False)
# FIX: iterate counts from all features down to one instead of relying on
# positional slicing of the reversed index — equivalent behaviour (including
# the strict ">" tie-breaking), but no longer fragile.
for k in range(len(fi_xgb), 0, -1):
    cols = fi_xgb.Feature[0:k]
    models = XGBClassifier(seed=rnd, n_jobs=-1)
    cv = cross_validate(models, X_train.loc[:, cols], y_train, cv=3, n_jobs=-1, return_estimator=True)
    tr = np.mean(cv['test_score'])
    ts = cv["estimator"][0].score(X_test.loc[:, cols], y_test)
    cv_list.append(tr)
    test_list.append(ts)
    if tr > xgb_t:
        xgb_t = tr
        names_xgb = cols.tolist()
        xgb_s = ts
        nof = k
print("Optimum number of features: {}".format(nof))
print('Selected features are:', names_xgb, "\n")
print('Mean training accuracy with {} features: {:.6f}'.format(nof, xgb_t))
print("Test accuracy with {} features: {:.6f}".format(nof, xgb_s))
# Grid search over the main XGBoost regularisation/structure hyperparameters.
X_train_xgb = X_train.loc[:, names_xgb]
X_test_xgb = X_test.loc[:, names_xgb]
param_xgb = {
    "eta": [0.01, 0.1, 0.2],
    "max_depth": [3, 10, 50],
    "min_child_weight": [1, 4, 7],
    "gamma": [0.0, 0.1, 0.5, 1],
    "reg_lambda": [0.1, 1, 10],
}
xgb = XGBClassifier(seed=rnd, n_jobs=-1)
clf_xgb = GridSearchCV(xgb, param_xgb, cv=3, n_jobs=-1)
clf_xgb.fit(X_train_xgb, y_train)
print(f"Tuned XGB Classification Parameters: {clf_xgb.best_params_}")
print(f"Mean of the cv scores is {clf_xgb.best_score_:.6f}")
print(f"Test Score {clf_xgb.score(X_test_xgb, y_test):.6f}")
print(f"Seconds used for refitting the best model on the train dataset: {clf_xgb.refit_time_:.6f}")
y_pred_xgb = clf_xgb.predict(X_test_xgb)
#Confusion Matrix
conf = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(16, 8))
ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='g')  # annot=True to annotate cells
# labels, title and ticks
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
ax.set_title('Confusion Matrix of XGBoost', fontsize=20).set_position([.5, 1.02])
ax.xaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
ax.yaxis.set_ticklabels(['Not Popular', 'Popular'], fontsize=12)
plt.show()
# BUGFIX: the ROC/AUC computation is moved before the precision-recall plot —
# the original referenced roc_auc_xgb in the PR label before it was defined
# (NameError), and "'XGB' % roc_auc_xgb" would have raised TypeError anyway.
y_pred_prop = clf_xgb.predict_proba(X_test_xgb)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_prop)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)
plt.figure(figsize=(14, 10))
plt.plot(fpr_xgb, tpr_xgb, color='darkorange',
         label='ROC curve (area = %0.2f)' % roc_auc_xgb)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Precision-recall curve from the positive-class probabilities (the original
# passed hard 0/1 predictions, which yields a degenerate two-point curve).
xgb_precision, xgb_recall, _ = precision_recall_curve(y_test, y_pred_prop)
no_skill = len(y_test[y_test == 1]) / len(y_test)
plt.figure(figsize=(14, 10))
plt.plot(xgb_recall, xgb_precision, color='red', label='XGB')
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall', fontsize=18, labelpad=10)
plt.ylabel('Precision', fontsize=18)
plt.title('XGBClassifier Precision-Recall Curve', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
# Summary metrics for the comparison table.
train_ac_xgb = clf_xgb.best_score_
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_score_xgb = f1_score(y_test, y_pred_xgb)
time_xgb = clf_xgb.refit_time_
res_col = ['Train Accuracy', 'Test Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Fitting Time']
xgb_result = np.array([train_ac_xgb, accuracy_xgb, precision_xgb, recall_xgb, f1_score_xgb, roc_auc_xgb, time_xgb]).reshape(1, -1)
f_xgb = pd.DataFrame(xgb_result, index=['XGBoost'], columns=res_col)
f_xgb
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the per-model result rows identically (f_dt first, same
# order as the original append call).
final_results = pd.concat([f_dt, f_svc, f_rnd, f_log, f_xgb])
final_results.sort_values(by="AUC", ascending=False)
# Overlay the ROC curves of the three tree-based models for comparison.
plt.figure(figsize=(14, 10))
roc_curves = [
    (fpr_xgb, tpr_xgb, 'red', 'ROC curve XGB (area = %0.2f)' % roc_auc_xgb),
    (fpr_rnd, tpr_rnd, 'green', 'ROC curve Random Forest(area = %0.2f)' % roc_auc_rnd),
    (fpr_dt, tpr_dt, 'purple', 'ROC curve Decision Tree(area = %0.2f)' % roc_auc_dt),
]
for fpr, tpr, colour, curve_label in roc_curves:
    plt.plot(fpr, tpr, color=colour, label=curve_label)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18, labelpad=10)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver Operating Characteristic', fontsize=22).set_position([.5, 1.02])
plt.legend(loc="lower right", fontsize=13)
plt.show()
We see that the best performing algorithm is the XGBoost classifier if we compare them by their AUC values. Besides that, the Decision Tree algorithm performed very well. Sending the 2 features with the highest importance, Artist_Popularity and Artist_Followers, into the algorithm decreases the model complexity and increases the performance. Also, the Decision Tree model ran significantly faster than the other models. It seems all 3 models — XGBoost, Random Forest and Decision Tree — perform well. More hyperparameter tuning can be done on these algorithms to increase performance.
The 2 main features of the tracks, Artist_Popularity and Artist_Followers, were selected as input for all models. Besides these, Album_Number is one of the other features that affect track popularity. So, we can say that the ordering of the songs on albums is important for popularity. The other features have less importance for popularity, but this differs from model to model. Another conclusion of the project is that feature selection is very important in machine learning applications: unnecessary features highly decrease the model performance. Thus, feature selection and feature engineering play an important role in model building.